library(dplyr) 
library(ggplot2)
library(tidyverse)
library(zoo)
library(knitr)
library(readxl)

url = 'https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv'
read_csv(url)
## # A tibble: 440,745 x 6
##    date       county      state      fips  cases deaths
##    <date>     <chr>       <chr>      <chr> <dbl>  <dbl>
##  1 2020-01-21 Snohomish   Washington 53061     1      0
##  2 2020-01-22 Snohomish   Washington 53061     1      0
##  3 2020-01-23 Snohomish   Washington 53061     1      0
##  4 2020-01-24 Cook        Illinois   17031     1      0
##  5 2020-01-24 Snohomish   Washington 53061     1      0
##  6 2020-01-25 Orange      California 06059     1      0
##  7 2020-01-25 Cook        Illinois   17031     1      0
##  8 2020-01-25 Snohomish   Washington 53061     1      0
##  9 2020-01-26 Maricopa    Arizona    04013     1      0
## 10 2020-01-26 Los Angeles California 06037     1      0
## # … with 440,735 more rows
source = "/Users/xingxin/Github/geog176a-lab2/data/"


covid<-read_csv(url)
head(covid)
## # A tibble: 6 x 6
##   date       county    state      fips  cases deaths
##   <date>     <chr>     <chr>      <chr> <dbl>  <dbl>
## 1 2020-01-21 Snohomish Washington 53061     1      0
## 2 2020-01-22 Snohomish Washington 53061     1      0
## 3 2020-01-23 Snohomish Washington 53061     1      0
## 4 2020-01-24 Cook      Illinois   17031     1      0
## 5 2020-01-24 Snohomish Washington 53061     1      0
## 6 2020-01-25 Orange    California 06059     1      0

Question 1

#  California, the daily new cases
dat = covid %>% 
  filter(state == "California") %>% 
  group_by(county) %>% 
  mutate(newCases = c(NA, diff(cases))) %>% 
  ungroup() %>% 
  filter(date == max(date)) 
#  the 5 counties with the most cumulative cases
most_cum_cases = dat %>% 
  slice_max(cases, n = 5) %>% 
  select(county, cases)

knitr::kable(most_cum_cases, 
             caption = "Most Cumulative Cases California Counties",
             col.names = c("County", "cumulative Cases"))
Most Cumulative Cases California Counties
County cumulative Cases
Los Angeles 221950
Riverside 46720
Orange 43709
San Bernardino 41231
San Diego 34741
# the 5 counties with the most NEW cases
most_new_cases = dat %>% 
  slice_max(newCases, n = 5) %>% 
  select(county, newCases)

knitr::kable(most_new_cases, 
             caption = "Most New Cases California Counties",
             col.names = c("County", "New Cases"))
Most New Cases California Counties
County New Cases
Los Angeles 1188
San Bernardino 789
Fresno 703
Sacramento 373
Orange 342
# import population data
PopulationEstimates <- read_excel(paste(source,"PopulationEstimates.xls",sep = ""), 
                                  skip = 2)
population_2019 = PopulationEstimates %>% 
  select(FIPStxt,POP_ESTIMATE_2019) 

# join the population data to the California COVID data.  
dat2 <-inner_join(dat, population_2019, by = c("fips" = "FIPStxt"))
#  the 5 counties with the most cumulative cases per capita
most_cum_cases_pc = dat2 %>%
  mutate(Cases_pc = cases/POP_ESTIMATE_2019) %>%
  slice_max(Cases_pc, n = 5) %>% 
  select(county, Cases_pc)

knitr::kable(most_cum_cases_pc, 
             caption = "Most Cumulative Cases California Counties Per Capita",
             col.names = c("County", "cumulative Cases Per Capita"))
Most Cumulative Cases California Counties Per Capita
County cumulative Cases Per Capita
Imperial 0.0555031
Kings 0.0360534
Kern 0.0295156
Tulare 0.0263023
Merced 0.0244058
# the 5 counties with the most NEW cases per capita
most_new_cases_pc = dat2 %>% 
  mutate(newCases_pc = newCases/POP_ESTIMATE_2019) %>%
  slice_max(newCases_pc, n = 5) %>% 
  select(county, newCases_pc)

knitr::kable(most_new_cases_pc, 
             caption = "Most New Cases California Counties Per Capit",
             col.names = c("County", "New Cases Per Capit"))
Most New Cases California Counties Per Capit
County New Cases Per Capit
Monterey 0.0007833
Fresno 0.0007036
Kings 0.0006865
Inyo 0.0004435
Sutter 0.0004434
# 14-days 
per_cap = covid %>% 
  filter(state == "California") %>% 
  group_by(county) %>% 
  mutate(newCases = c(NA, diff(cases))) %>% 
  ungroup() %>%
  filter(date >= max(date)-13)  %>%
  inner_join(population_2019, by = c("fips" = "FIPStxt"))

#Calculate Safe counties
dat_14 = per_cap %>%
  group_by(county,POP_ESTIMATE_2019) %>%
  summarize(tot_newCases = sum(newCases, na.rm = TRUE),tot_cumCases = sum(cases, na.rm = TRUE)) %>%  
  ungroup() %>%
  mutate(tot_safe_county = tot_newCases/(POP_ESTIMATE_2019/100000)) %>% 
  filter(tot_safe_county < 100)

Results of California

  1. the total number of cases: 6.2850810^{5}

  2. the total number of new cases: 6527

  3. the total number of the total number of safe counties : 12

Question 2

# facet plot of the daily new cases and the 7-day rolling mean
# select four states
four_states <- covid %>%
  filter(state %in% c("New York","California", "Louisiana", "Florida")) %>%
  group_by(state, date) %>%
  summarize(cases = sum(cases)) %>% 
  ungroup() %>% 
  group_by(state) %>% 
  mutate(newCases = cases - lag(cases),
         roll7 = rollmean(newCases, 7, fill = NA, align = 'right')) %>%
  ungroup() %>%
  filter(newCases > 0)

  ggplot(data = four_states,aes(x = date)) +
  geom_col(aes(y = newCases), col = NA, fill = "#F5B8B5") +  
  geom_line(aes(y = roll7), col = "darkred", size = 1) +  
  labs(x = "Date", y = "Daily New Cases",
       title = "New Reported Cases By Day ") +
  theme(plot.background = element_rect(fill = "white"),  
        panel.background = element_rect(fill = "white"),  
        plot.title = element_text(size = 14, face = 'bold')) +  
  facet_grid(~state, scales = "free_y")

# import population data
PopulationEstimates <- read_excel(paste(source,"PopulationEstimates.xls",sep = ""), 
                                  skip = 2)
population_2019 = PopulationEstimates %>% 
  select(Area_Name,POP_ESTIMATE_2019) 
# facet plot of the daily new cases and the 7-day rolling mean
# select four states per capita
four_states_pc <- population_2019 %>%
  right_join(four_states, by = c("Area_Name" = "state")) %>%
  mutate(newCases_pc = (newCases/POP_ESTIMATE_2019), 
         roll7 = rollmean(newCases_pc, 7, fill = NA, align = 'right')) %>%
  ungroup() %>%
  filter(newCases_pc > 0)

  ggplot(data = four_states_pc,aes(x = date)) +
  geom_col(aes(y = newCases_pc), col = NA, fill = "skyblue") +  
  geom_line(aes(y = roll7), col = "darkblue", size = 1) +  
  labs(x = "Date", y = "Daily New Cases Per Capita",
       title = "New Reported Cases Per Capita By Day ") +
  theme(plot.background = element_rect(fill = "white"),  
        panel.background = element_rect(fill = "white"),  
        plot.title = element_text(size = 14, face = 'bold')) +  
  facet_grid(~Area_Name, scales = "free_y") 

The absolute value of new cases in Louisiana is much lower than the other three states. However, the new cases per capita in Louisiana is close to the other states. The possible reason is that Louisiana has a much smaller populaition compared with other states.